# Read datasets ----
# Dataset with outliers
BostonCondo_OT <- read.csv("Boston_Condo_w_Outliers.csv", stringsAsFactors = FALSE)
# Dataset without outliers
BostonCondo_No_OT <- read.csv("Boston_Condo_no_Outliers.csv", stringsAsFactors = FALSE)
# Drop the first column (row index written by write.csv)
BostonCondo_OT <- BostonCondo_OT[, -1]
BostonCondo_No_OT <- BostonCondo_No_OT[, -1]
# ZIPCODE is an identifier, not a quantity: keep it as character so it is
# treated as a categorical predictor, not a number
BostonCondo_OT$ZIPCODE <- as.character(BostonCondo_OT$ZIPCODE)
BostonCondo_No_OT$ZIPCODE <- as.character(BostonCondo_No_OT$ZIPCODE)
str(BostonCondo_OT)
## 'data.frame':    65198 obs. of  25 variables:
##  $ ZIPCODE      : chr  "2128" "2128" "2128" "2110" ...
##  $ OWN_OCC      : chr  "Y" "Y" "N" "N" ...
##  $ AV_BLDG      : int  364700 373400 394400 639100 690200 956800 790900 606400 547100 380800 ...
##  $ YR_BUILT     : int  1900 1900 1900 1972 1972 1972 1972 1972 1972 1960 ...
##  $ YR_REMOD     : int  2018 2018 2018 0 1977 2005 0 2017 0 2009 ...
##  $ GROSS_AREA   : int  791 799 908 744 868 1636 1210 880 756 952 ...
##  $ NUM_FLOORS   : num  1 1 1 1 1 1 1 1 1 2 ...
##  $ U_BASE_FLOOR : int  1 2 3 7 7 7 7 7 8 1 ...
##  $ U_NUM_PARK   : int  0 0 0 0 1 0 0 0 0 1 ...
##  $ U_CORNER     : chr  "N - No" "N - No" "N - No" "N - No" ...
##  $ U_ORIENT     : chr  "T - Through" "T - Through" "T - Through" "T - Through" ...
##  $ U_BDRMS      : int  2 3 3 2 1 2 2 1 1 2 ...
##  $ U_FULL_BTH   : int  1 1 1 1 1 2 1 1 1 1 ...
##  $ U_HALF_BTH   : int  0 0 0 0 0 1 1 0 0 0 ...
##  $ U_BTH_STYLE  : chr  "M - Modern" "M - Modern" "M - Modern" "L - Luxury" ...
##  $ U_BTH_STYLE2 : chr  "No" "No" "No" "No" ...
##  $ U_BTH_STYLE3 : chr  "No" "No" "No" "No" ...
##  $ U_KITCH_TYPE : chr  "F - Full Eat In" "F - Full Eat In" "F - Full Eat In" "O - One Person" ...
##  $ U_KITCH_STYLE: chr  "M - Modern" "M - Modern" "M - Modern" "L - Luxury" ...
##  $ U_HEAT_TYP   : chr  "W - Ht Water/Steam" "W - Ht Water/Steam" "W - Ht Water/Steam" "W - Ht Water/Steam" ...
##  $ U_AC         : chr  "N - None" "N - None" "N - None" "C - Central AC" ...
##  $ U_FPLACE     : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ U_INT_FIN    : chr  "N - Normal" "N - Normal" "N - Normal" "N - Normal" ...
##  $ U_INT_CND    : chr  "G - Good" "G - Good" "G - Good" "E - Excellent" ...
##  $ U_VIEW       : chr  "A - Average" "A - Average" "A - Average" "G - Good" ...
str(BostonCondo_No_OT)
## 'data.frame':    58105 obs. of  25 variables:
##  $ ZIPCODE      : chr  "2128" "2128" "2128" "2110" ...
##  $ OWN_OCC      : chr  "Y" "Y" "N" "N" ...
##  $ AV_BLDG      : int  364700 373400 394400 639100 690200 956800 790900 606400 547100 380800 ...
##  $ YR_BUILT     : int  1900 1900 1900 1972 1972 1972 1972 1972 1972 1960 ...
##  $ YR_REMOD     : int  2018 2018 2018 0 1977 2005 0 2017 0 2009 ...
##  $ GROSS_AREA   : int  791 799 908 744 868 1636 1210 880 756 952 ...
##  $ NUM_FLOORS   : num  1 1 1 1 1 1 1 1 1 2 ...
##  $ U_BASE_FLOOR : int  1 2 3 7 7 7 7 7 8 1 ...
##  $ U_NUM_PARK   : int  0 0 0 0 1 0 0 0 0 1 ...
##  $ U_CORNER     : chr  "N - No" "N - No" "N - No" "N - No" ...
##  $ U_ORIENT     : chr  "T - Through" "T - Through" "T - Through" "T - Through" ...
##  $ U_BDRMS      : int  2 3 3 2 1 2 2 1 1 2 ...
##  $ U_FULL_BTH   : int  1 1 1 1 1 2 1 1 1 1 ...
##  $ U_HALF_BTH   : int  0 0 0 0 0 1 1 0 0 0 ...
##  $ U_BTH_STYLE  : chr  "M - Modern" "M - Modern" "M - Modern" "L - Luxury" ...
##  $ U_BTH_STYLE2 : chr  "No" "No" "No" "No" ...
##  $ U_BTH_STYLE3 : chr  "No" "No" "No" "No" ...
##  $ U_KITCH_TYPE : chr  "F - Full Eat In" "F - Full Eat In" "F - Full Eat In" "O - One Person" ...
##  $ U_KITCH_STYLE: chr  "M - Modern" "M - Modern" "M - Modern" "L - Luxury" ...
##  $ U_HEAT_TYP   : chr  "W - Ht Water/Steam" "W - Ht Water/Steam" "W - Ht Water/Steam" "W - Ht Water/Steam" ...
##  $ U_AC         : chr  "N - None" "N - None" "N - None" "C - Central AC" ...
##  $ U_FPLACE     : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ U_INT_FIN    : chr  "N - Normal" "N - Normal" "N - Normal" "N - Normal" ...
##  $ U_INT_CND    : chr  "G - Good" "G - Good" "G - Good" "E - Excellent" ...
##  $ U_VIEW       : chr  "A - Average" "A - Average" "A - Average" "G - Good" ...
library(caret)
## Loading required package: lattice
## Loading required package: ggplot2

Scenario 1: With outliers

# Split the dataset into training and testing (90% training, 10% testing).
# The split below was run once and its result saved to CSV so the same
# train/test partition is reused across runs; the code is kept for reference.
# NOTE(review): the loop overwrites train1/test1 on every iteration, so only
# the LAST fold's split survives — with createFolds' default 10 folds that
# yields the intended ~90/10 split, but confirm this was deliberate.
#folds1 <- createFolds(BostonCondo_OT$AV_BLDG)
#
#for (f in folds1){
#  train1 <- BostonCondo_OT[-f,]
#  test1 <- BostonCondo_OT[f,]
#}
# Save training & testing dataset
#write.csv(train1, "train1.csv")
#write.csv(test1, "test1.csv")
# Read the saved split back in
train1 <- read.csv("train1.csv", stringsAsFactors = FALSE)
test1 <- read.csv("test1.csv", stringsAsFactors = FALSE)
# Drop the row-index column added by write.csv
train1 <- train1[, -1]
test1 <- test1[, -1]
# ZIPCODE stays a character identifier (categorical), not a number
train1$ZIPCODE <- as.character(train1$ZIPCODE)
test1$ZIPCODE <- as.character(test1$ZIPCODE)
# Multiple linear regression - 10-fold cross validation (number = 10)
# Timed with Sys.time() before/after the fit
startime <- Sys.time()
control1_1 <- trainControl(method="cv", number = 10)
mlr1 <- train(AV_BLDG~., data = train1, method ="lm", trControl=control1_1)
## Warning in predict.lm(modelFit, newdata): prediction from a rank-deficient fit
## may be misleading
endtime <- Sys.time()
endtime - startime
## Time difference of 18.07396 secs
# Cross-validated resampling performance of the fitted model
mlr1
## Linear Regression 
## 
## 58678 samples
##    24 predictor
## 
## No pre-processing
## Resampling: Cross-Validated (10 fold) 
## Summary of sample sizes: 52810, 52810, 52810, 52811, 52811, 52811, ... 
## Resampling results:
## 
##   RMSE      Rsquared   MAE     
##   339103.4  0.8028159  177918.2
## 
## Tuning parameter 'intercept' was held constant at a value of TRUE
# Coefficient estimates of the final lm fit (character predictors are
# dummy-encoded by the formula interface, hence one row per level)
summary(mlr1)
## 
## Call:
## lm(formula = .outcome ~ ., data = dat)
## 
## Residuals:
##       Min        1Q    Median        3Q       Max 
## -12294752   -110577     15471    118242  23401773 
## 
## Coefficients:
##                                    Estimate Std. Error t value Pr(>|t|)    
## (Intercept)                       2.016e+06  3.532e+05   5.708 1.15e-08 ***
## ZIPCODE2109                      -1.702e+05  1.530e+04 -11.122  < 2e-16 ***
## ZIPCODE2110                      -2.902e+05  1.540e+04 -18.848  < 2e-16 ***
## ZIPCODE2111                      -3.277e+05  1.432e+04 -22.883  < 2e-16 ***
## ZIPCODE2113                      -2.211e+05  1.534e+04 -14.411  < 2e-16 ***
## ZIPCODE2114                      -1.920e+05  1.301e+04 -14.764  < 2e-16 ***
## ZIPCODE2115                      -1.474e+05  1.287e+04 -11.454  < 2e-16 ***
## ZIPCODE2116                      -1.792e+03  1.188e+04  -0.151 0.880121    
## ZIPCODE2118                      -1.531e+05  1.242e+04 -12.320  < 2e-16 ***
## ZIPCODE2119                      -6.929e+05  1.632e+04 -42.448  < 2e-16 ***
## ZIPCODE2120                      -5.858e+05  2.090e+04 -28.035  < 2e-16 ***
## ZIPCODE2121                      -7.138e+05  2.144e+04 -33.294  < 2e-16 ***
## ZIPCODE2122                      -7.012e+05  1.500e+04 -46.746  < 2e-16 ***
## ZIPCODE2124                      -7.094e+05  1.504e+04 -47.163  < 2e-16 ***
## ZIPCODE2125                      -6.241e+05  1.429e+04 -43.661  < 2e-16 ***
## ZIPCODE2126                      -6.897e+05  2.413e+04 -28.585  < 2e-16 ***
## ZIPCODE2127                      -4.127e+05  1.226e+04 -33.672  < 2e-16 ***
## ZIPCODE2128                      -5.327e+05  1.406e+04 -37.898  < 2e-16 ***
## ZIPCODE2129                      -4.000e+05  1.286e+04 -31.109  < 2e-16 ***
## ZIPCODE2130                      -6.188e+05  1.276e+04 -48.508  < 2e-16 ***
## ZIPCODE2131                      -7.398e+05  1.455e+04 -50.835  < 2e-16 ***
## ZIPCODE2132                      -6.654e+05  1.573e+04 -42.302  < 2e-16 ***
## ZIPCODE2134                      -4.702e+05  1.396e+04 -33.672  < 2e-16 ***
## ZIPCODE2135                      -4.641e+05  1.254e+04 -37.010  < 2e-16 ***
## ZIPCODE2136                      -7.854e+05  1.753e+04 -44.797  < 2e-16 ***
## ZIPCODE2199                       2.944e+06  8.676e+04  33.938  < 2e-16 ***
## ZIPCODE2210                      -1.576e+05  1.712e+04  -9.206  < 2e-16 ***
## ZIPCODE2215                      -2.113e+05  1.331e+04 -15.880  < 2e-16 ***
## ZIPCODE2446                      -7.540e+05  2.411e+05  -3.127 0.001766 ** 
## ZIPCODE2467                      -7.703e+05  1.753e+04 -43.952  < 2e-16 ***
## OWN_OCCY                         -2.536e+04  3.032e+03  -8.367  < 2e-16 ***
## YR_BUILT                         -6.186e+02  4.621e+01 -13.387  < 2e-16 ***
## YR_REMOD                          4.690e+00  2.068e+00   2.268 0.023329 *  
## GROSS_AREA                        7.639e+02  4.781e+00 159.777  < 2e-16 ***
## NUM_FLOORS                       -1.664e+05  3.328e+03 -49.982  < 2e-16 ***
## U_BASE_FLOOR                     -1.741e+03  5.022e+02  -3.467 0.000526 ***
## U_NUM_PARK                        3.930e+04  2.573e+03  15.271  < 2e-16 ***
## `U_CORNERY - Yes`                 4.794e+04  3.940e+03  12.167  < 2e-16 ***
## `U_ORIENTB - Rear Below`         -1.759e+04  1.176e+04  -1.496 0.134579    
## `U_ORIENTC - Courtyard`          -1.435e+04  9.630e+03  -1.490 0.136197    
## `U_ORIENTE - End`                -1.629e+04  3.081e+04  -0.529 0.596893    
## `U_ORIENTF - Front/Street`       -2.361e+04  4.748e+03  -4.973 6.62e-07 ***
## `U_ORIENTM - Middle`              1.434e+04  8.638e+03   1.660 0.096897 .  
## `U_ORIENTT - Through`             1.461e+03  4.816e+03   0.303 0.761593    
## U_BDRMS                          -4.568e+04  2.547e+03 -17.937  < 2e-16 ***
## U_FULL_BTH                        3.456e+05  6.989e+03  49.441  < 2e-16 ***
## U_HALF_BTH                        3.268e+05  7.760e+03  42.107  < 2e-16 ***
## `U_BTH_STYLEM - Modern`          -1.495e+05  1.076e+04 -13.889  < 2e-16 ***
## `U_BTH_STYLEN - No Remodeling`   -1.346e+05  1.854e+04  -7.261 3.88e-13 ***
## `U_BTH_STYLES - Semi-Modern`     -1.484e+05  1.227e+04 -12.095  < 2e-16 ***
## `U_BTH_STYLE2M - Modern`         -2.351e+03  1.300e+04  -0.181 0.856452    
## `U_BTH_STYLE2N - No Remodeling`  -7.496e+04  2.758e+04  -2.718 0.006578 ** 
## U_BTH_STYLE2No                    3.350e+05  1.442e+04  23.240  < 2e-16 ***
## `U_BTH_STYLE2S - Semi-Modern`    -6.306e+04  1.408e+04  -4.480 7.49e-06 ***
## `U_BTH_STYLE3M - Modern`         -5.379e+05  1.579e+04 -34.060  < 2e-16 ***
## `U_BTH_STYLE3N - No Remodeling`  -4.993e+05  4.939e+04 -10.109  < 2e-16 ***
## U_BTH_STYLE3No                   -3.384e+05  1.697e+04 -19.946  < 2e-16 ***
## `U_BTH_STYLE3S - Semi-Modern`    -6.664e+05  1.874e+04 -35.563  < 2e-16 ***
## `U_KITCH_TYPEF - Full Eat In`    -6.907e+04  3.407e+05  -0.203 0.839326    
## `U_KITCH_TYPEN - None`           -2.103e+05  3.424e+05  -0.614 0.539202    
## `U_KITCH_TYPEO - One Person`     -9.831e+04  3.407e+05  -0.289 0.772897    
## `U_KITCH_TYPEP - Pullman`        -1.209e+05  3.407e+05  -0.355 0.722736    
## `U_KITCH_STYLEM - Modern`        -9.153e+04  9.522e+03  -9.613  < 2e-16 ***
## `U_KITCH_STYLEN - No Remodeling` -1.371e+05  1.796e+04  -7.634 2.31e-14 ***
## `U_KITCH_STYLES - Semi-Modern`   -1.147e+05  1.111e+04 -10.322  < 2e-16 ***
## `U_HEAT_TYPF - Forced Hot Air`   -2.542e+04  6.323e+03  -4.021 5.81e-05 ***
## `U_HEAT_TYPN - None`             -9.535e+05  1.720e+05  -5.543 2.98e-08 ***
## `U_HEAT_TYPO - Other`             4.283e+05  9.135e+04   4.688 2.76e-06 ***
## `U_HEAT_TYPP - Heat Pump`        -2.152e+04  8.387e+03  -2.566 0.010288 *  
## `U_HEAT_TYPS - Space Heat`       -6.625e+03  3.853e+04  -0.172 0.863491    
## `U_HEAT_TYPW - Ht Water/Steam`   -4.602e+04  5.856e+03  -7.859 3.94e-15 ***
## `U_ACD - Ductless AC`             1.645e+04  2.390e+04   0.688 0.491260    
## `U_ACN - None`                    1.645e+04  4.286e+03   3.838 0.000124 ***
## U_FPLACE                          6.332e+04  2.893e+03  21.886  < 2e-16 ***
## `U_INT_FING - Good`              -3.577e+05  1.395e+05  -2.564 0.010348 *  
## `U_INT_FINN - Normal`            -3.555e+05  8.434e+03 -42.154  < 2e-16 ***
## `U_INT_FINS - Substandard`       -3.944e+05  5.006e+04  -7.879 3.37e-15 ***
## `U_INT_CNDE - Excellent`          9.724e+04  7.245e+03  13.420  < 2e-16 ***
## `U_INT_CNDF - Fair`              -3.675e+04  2.497e+04  -1.472 0.141047    
## `U_INT_CNDG - Good`              -3.797e+03  4.352e+03  -0.872 0.383016    
## `U_INT_CNDP - Poor`              -1.319e+05  7.326e+04  -1.800 0.071815 .  
## `U_VIEWE - Excellent`             3.426e+05  8.502e+03  40.297  < 2e-16 ***
## `U_VIEWF - Fair`                 -4.322e+04  7.232e+03  -5.976 2.30e-09 ***
## `U_VIEWG - Good`                  4.133e+04  4.526e+03   9.132  < 2e-16 ***
## `U_VIEWP - Poor`                 -1.673e+05  2.657e+04  -6.296 3.07e-10 ***
## `U_VIEWS - Special`               1.605e+06  2.769e+04  57.948  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 340500 on 58592 degrees of freedom
## Multiple R-squared:  0.8016, Adjusted R-squared:  0.8013 
## F-statistic:  2785 on 85 and 58592 DF,  p-value: < 2.2e-16
# Predict on the test set with the cross-validated linear model
predict_mlr1 <- predict(mlr1, test1)
# Performance metrics for the prediction on the testing set:
# R2 (R-squared): share of the variance in the response explained by the model
# RMSE (root mean squared error): standard deviation of the residuals
# MAE (mean absolute error): average absolute difference between actual and
# predicted values, i.e. the average size of the residuals
data.frame(R2_1_1 = R2(predict_mlr1, test1$AV_BLDG),
           RMSE_1_1 = RMSE(predict_mlr1, test1$AV_BLDG),
           MAE_1_1 = MAE(predict_mlr1, test1$AV_BLDG))
##      R2_1_1 RMSE_1_1  MAE_1_1
## 1 0.7816596 350471.3 178332.3
# Observation index for plotting in test-set order
x <- seq_along(test1$AV_BLDG)

plot(x, test1$AV_BLDG, col = "red", ylab = "Value", type = "p",
     main = "Boston Condo Property Assessment - Actual Value")

plot(x, predict_mlr1, col = "blue", ylab = "Value", type = "p",
     main = "Boston Condo Property Assessment - Predicted Value")

# Overlay predicted values on actual values in one chart
plot(x, test1$AV_BLDG, col = "red", ylab = "Value", type = "p",
     main = "Boston Condo Property Assessment")
par(new = TRUE)
plot(x, predict_mlr1, col = "blue", ylab = "", type = "p")
legend("topright", legend = c("Actual Value", "Predicted Value"),
       fill = c("red", "blue"), col = 2:3, adj = c(0, 0.6))
grid()

# Predicted vs actual; points near the 45-degree line indicate good predictions
plot(predict_mlr1, test1$AV_BLDG, xlab = "predicted value", ylab = "actual value")
abline(a = 0, b = 1)

library(randomForest)
## randomForest 4.6-14
## Type rfNews() to see new features/changes/bug fixes.
## 
## Attaching package: 'randomForest'
## The following object is masked from 'package:ggplot2':
## 
##     margin
# Random forest with 5 trees (10-fold CV, grid search over mtry)
startime <- Sys.time()
control1_2 <- trainControl(method = "cv", number = 10, search = "grid")
rf1_1 <- train(AV_BLDG ~ ., data = train1, method = "rf", ntree = 5,
               trControl = control1_2)
endtime <- Sys.time()
endtime - startime
## Time difference of 23.68411 mins
rf1_1
## Random Forest 
## 
## 58678 samples
##    24 predictor
## 
## No pre-processing
## Resampling: Cross-Validated (10 fold) 
## Summary of sample sizes: 52810, 52810, 52810, 52810, 52810, 52811, ... 
## Resampling results across tuning parameters:
## 
##   mtry  RMSE      Rsquared   MAE      
##    2    450899.2  0.7080347  210605.40
##   43    223102.4  0.9138811   78101.69
##   85    220741.8  0.9162428   76171.78
## 
## RMSE was used to select the optimal model using the smallest value.
## The final value used for the model was mtry = 85.
plot(rf1_1)

# Predict on test set
predict_rf1_1 <- predict(rf1_1, test1)
# Show performance metrics on test set
data.frame(R2_1_2 = R2(predict_rf1_1, test1$AV_BLDG),
           RMSE_1_2 = RMSE(predict_rf1_1, test1$AV_BLDG),
           MAE_1_2 = MAE(predict_rf1_1, test1$AV_BLDG))
##      R2_1_2 RMSE_1_2  MAE_1_2
## 1 0.9243845 205950.6 75942.36
# Visualize the results of actual value and predicted value
x <- seq_along(test1$AV_BLDG)

plot(x, test1$AV_BLDG, col = "red", ylab = "Value", type = "p",
     main = "Boston Condo Property Assessment - Actual Value")

plot(x, predict_rf1_1, col = "blue", ylab = "Value", type = "p",
     main = "Boston Condo Property Assessment - Predicted Value")

plot(x, test1$AV_BLDG, col = "red", ylab = "Value", type = "p",
     main = "Boston Condo Property Assessment")
par(new = TRUE)
plot(x, predict_rf1_1, col = "blue", ylab = "", type = "p")
legend("topright", legend = c("Actual Value", "Predicted Value"),
       fill = c("red", "blue"), col = 2:3, adj = c(0, 0.6))
grid()

# Visualize how close predicted value and actual value are
plot(predict_rf1_1, test1$AV_BLDG, xlab = "predicted value", ylab = "actual value")
abline(a = 0, b = 1)

# Random forest with 10 trees (reuses control1_2: 10-fold CV, grid search)
startime <- Sys.time()
rf1_2 <- train(AV_BLDG ~ ., data = train1, method = "rf", ntree = 10,
               trControl = control1_2)
endtime <- Sys.time()
endtime - startime
## Time difference of 46.81932 mins
rf1_2
## Random Forest 
## 
## 58678 samples
##    24 predictor
## 
## No pre-processing
## Resampling: Cross-Validated (10 fold) 
## Summary of sample sizes: 52810, 52811, 52810, 52811, 52810, 52810, ... 
## Resampling results across tuning parameters:
## 
##   mtry  RMSE      Rsquared   MAE      
##    2    441245.2  0.7428750  201307.93
##   43    205585.3  0.9255190   72337.94
##   85    201065.4  0.9296036   71580.06
## 
## RMSE was used to select the optimal model using the smallest value.
## The final value used for the model was mtry = 85.
plot(rf1_2)

# Predict on test set and show performance metrics
predict_rf1_2 <- predict(rf1_2, test1)
data.frame(R2_1_3 = R2(predict_rf1_2, test1$AV_BLDG),
           RMSE_1_3 = RMSE(predict_rf1_2, test1$AV_BLDG),
           MAE_1_3 = MAE(predict_rf1_2, test1$AV_BLDG))
##      R2_1_3 RMSE_1_3  MAE_1_3
## 1 0.9431407 179246.8 69571.51
# Visualize the results of actual value and predicted value
x <- seq_along(test1$AV_BLDG)

plot(x, test1$AV_BLDG, col = "red", ylab = "Value", type = "p",
     main = "Boston Condo Property Assessment - Actual Value")

plot(x, predict_rf1_2, col = "blue", ylab = "Value", type = "p",
     main = "Boston Condo Property Assessment - Predicted Value")

plot(x, test1$AV_BLDG, col = "red", ylab = "Value", type = "p",
     main = "Boston Condo Property Assessment")
par(new = TRUE)
plot(x, predict_rf1_2, col = "blue", ylab = "", type = "p")
legend("topright", legend = c("Actual Value", "Predicted Value"),
       fill = c("red", "blue"), col = 2:3, adj = c(0, 0.6))
grid()

# Visualize how close predicted value and actual value are
plot(predict_rf1_2, test1$AV_BLDG, xlab = "predicted value", ylab = "actual value")
abline(a = 0, b = 1)

library(class)
library(dplyr)
## 
## Attaching package: 'dplyr'
## The following object is masked from 'package:randomForest':
## 
##     combine
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
# Prepare data for k-nearest-neighbor regression (kNN needs numeric inputs)
x_train1 <- train1
x_test1 <- test1
# One-hot encode the categorical predictors
dmy1 <- dummyVars(" ~ .", data = x_train1)
x_train1 <- data.frame(predict(dmy1, newdata = x_train1))
# NOTE(review): the test set is encoded with a separately fitted dummyVars,
# which is why a missing level has to be patched in manually below; applying
# dmy1 to the test set would guarantee identical columns — confirm.
dmy2 <- dummyVars(" ~ .", data = x_test1)
x_test1 <- data.frame(predict(dmy2, newdata = x_test1))
# Plug a 0 column in for the level missing from the test set after one-hot
# encoding, and move it to the position it has in the training frame
x_test1$U_HEAT_TYPN...None <- 0
x_test1 <- x_test1 %>% relocate(U_HEAT_TYPN...None, .after = U_HEAT_TYPF...Forced.Hot.Air)
# k-nearest-neighbor regression, tuning k over 1:2 (10-fold CV via control1_1)
startime <- Sys.time()
knn1 <- train(AV_BLDG ~ ., data = x_train1, method = "knn",
              trControl = control1_1, tuneGrid = expand.grid(k = 1:2))
endtime <- Sys.time()
endtime - startime
## Time difference of 45.7329 mins
knn1
## k-Nearest Neighbors 
## 
## 58678 samples
##    99 predictor
## 
## No pre-processing
## Resampling: Cross-Validated (10 fold) 
## Summary of sample sizes: 52811, 52811, 52809, 52810, 52809, 52811, ... 
## Resampling results across tuning parameters:
## 
##   k  RMSE      Rsquared   MAE     
##   1  528408.8  0.5844252  185982.8
##   2  470666.3  0.6367171  186668.5
## 
## RMSE was used to select the optimal model using the smallest value.
## The final value used for the model was k = 2.
# Predict on the encoded test set and show performance metrics
predict_knn1 <- predict(knn1, x_test1)
data.frame(R2_1_4 = R2(predict_knn1, x_test1$AV_BLDG),
           RMSE_1_4 = RMSE(predict_knn1, x_test1$AV_BLDG),
           MAE_1_4 = MAE(predict_knn1, x_test1$AV_BLDG))
##      R2_1_4 RMSE_1_4  MAE_1_4
## 1 0.6155162   478633 184474.3
# Visualize the actual and predicted values
x <- seq_along(x_test1$AV_BLDG)

plot(x, x_test1$AV_BLDG, col = "red", ylab = "Value", type = "p",
     main = "Boston Condo Property Assessment - Actual Value")

plot(x, predict_knn1, col = "blue", ylab = "Value", type = "p",
     main = "Boston Condo Property Assessment - Predicted Value")

plot(x, x_test1$AV_BLDG, col = "red", ylab = "Value", type = "p",
     main = "Boston Condo Property Assessment")
par(new = TRUE)
plot(x, predict_knn1, col = "blue", ylab = "", type = "p")
legend("topright", legend = c("Actual Value", "Predicted Value"),
       fill = c("red", "blue"), col = 2:3, adj = c(0, 0.6))
grid()

# Predicted vs actual; points near the 45-degree line indicate good predictions
plot(predict_knn1, x_test1$AV_BLDG, xlab = "predicted value", ylab = "actual value")
abline(a = 0, b = 1)

Scenario 2: Without outliers in the training set

# Remove rows including outliers from the training set
# NOTE(review): rows are KEPT when their AV_BLDG value also appears in the
# no-outlier dataset; distinct condos that happen to share an outlier's
# assessed value would also be dropped (and vice versa) — confirm this
# value-based filter is intended rather than a row-identifier join.
train2 <- train1[train1$AV_BLDG %in% BostonCondo_No_OT$AV_BLDG,]
# Multiple linear regression - 10-fold cross validation (number = 10)
startime <- Sys.time()
control2_1 <- trainControl(method="cv", number = 10)
mlr2 <- train(AV_BLDG~., data = train2, method ="lm", trControl=control2_1)
## Warning in predict.lm(modelFit, newdata): prediction from a rank-deficient fit
## may be misleading

## Warning in predict.lm(modelFit, newdata): prediction from a rank-deficient fit
## may be misleading
endtime <- Sys.time()
endtime - startime
## Time difference of 14.07458 secs
# Cross-validated resampling performance of the fitted model
mlr2
## Linear Regression 
## 
## 53351 samples
##    24 predictor
## 
## No pre-processing
## Resampling: Cross-Validated (10 fold) 
## Summary of sample sizes: 48015, 48017, 48016, 48015, 48015, 48015, ... 
## Resampling results:
## 
##   RMSE      Rsquared   MAE     
##   117040.4  0.7851128  82314.91
## 
## Tuning parameter 'intercept' was held constant at a value of TRUE
# Coefficient estimates of the final lm fit
summary(mlr2)
## 
## Call:
## lm(formula = .outcome ~ ., data = dat)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -1093968   -57642     4889    55915   816243 
## 
## Coefficients:
##                                    Estimate Std. Error t value Pr(>|t|)    
## (Intercept)                       1.601e+06  1.238e+05  12.932  < 2e-16 ***
## ZIPCODE2109                      -1.199e+05  6.207e+03 -19.312  < 2e-16 ***
## ZIPCODE2110                      -8.627e+04  6.463e+03 -13.349  < 2e-16 ***
## ZIPCODE2111                      -1.505e+05  5.858e+03 -25.693  < 2e-16 ***
## ZIPCODE2113                      -1.634e+05  5.943e+03 -27.498  < 2e-16 ***
## ZIPCODE2114                      -1.253e+05  5.262e+03 -23.810  < 2e-16 ***
## ZIPCODE2115                      -1.137e+05  5.244e+03 -21.676  < 2e-16 ***
## ZIPCODE2116                      -2.504e+04  5.024e+03  -4.984 6.26e-07 ***
## ZIPCODE2118                      -8.073e+04  5.137e+03 -15.714  < 2e-16 ***
## ZIPCODE2119                      -4.986e+05  6.258e+03 -79.679  < 2e-16 ***
## ZIPCODE2120                      -4.365e+05  7.702e+03 -56.670  < 2e-16 ***
## ZIPCODE2121                      -5.509e+05  7.858e+03 -70.100  < 2e-16 ***
## ZIPCODE2122                      -5.096e+05  5.849e+03 -87.120  < 2e-16 ***
## ZIPCODE2124                      -5.127e+05  5.862e+03 -87.455  < 2e-16 ***
## ZIPCODE2125                      -4.474e+05  5.639e+03 -79.340  < 2e-16 ***
## ZIPCODE2126                      -5.737e+05  8.748e+03 -65.581  < 2e-16 ***
## ZIPCODE2127                      -2.642e+05  5.038e+03 -52.446  < 2e-16 ***
## ZIPCODE2128                      -4.144e+05  5.554e+03 -74.606  < 2e-16 ***
## ZIPCODE2129                      -2.256e+05  5.229e+03 -43.152  < 2e-16 ***
## ZIPCODE2130                      -3.856e+05  5.190e+03 -74.292  < 2e-16 ***
## ZIPCODE2131                      -5.252e+05  5.722e+03 -91.787  < 2e-16 ***
## ZIPCODE2132                      -5.053e+05  6.060e+03 -83.384  < 2e-16 ***
## ZIPCODE2134                      -3.519e+05  5.504e+03 -63.930  < 2e-16 ***
## ZIPCODE2135                      -3.483e+05  5.090e+03 -68.427  < 2e-16 ***
## ZIPCODE2136                      -5.968e+05  6.638e+03 -89.905  < 2e-16 ***
## ZIPCODE2199                      -3.465e+05  1.170e+05  -2.963 0.003053 ** 
## ZIPCODE2210                      -1.043e+05  7.424e+03 -14.055  < 2e-16 ***
## ZIPCODE2215                      -1.493e+05  5.321e+03 -28.062  < 2e-16 ***
## ZIPCODE2446                      -5.773e+05  8.279e+04  -6.973 3.15e-12 ***
## ZIPCODE2467                      -4.399e+05  6.777e+03 -64.914  < 2e-16 ***
## OWN_OCCY                          6.345e+03  1.096e+03   5.788 7.15e-09 ***
## YR_BUILT                         -3.458e+02  1.698e+01 -20.357  < 2e-16 ***
## YR_REMOD                          5.876e+00  7.475e-01   7.861 3.89e-15 ***
## GROSS_AREA                        2.561e+02  2.218e+00 115.503  < 2e-16 ***
## NUM_FLOORS                       -5.008e+04  1.304e+03 -38.412  < 2e-16 ***
## U_BASE_FLOOR                     -7.126e+02  2.173e+02  -3.279 0.001041 ** 
## U_NUM_PARK                        2.265e+04  9.612e+02  23.563  < 2e-16 ***
## `U_CORNERY - Yes`                 2.158e+04  1.440e+03  14.994  < 2e-16 ***
## `U_ORIENTB - Rear Below`         -3.738e+04  4.100e+03  -9.116  < 2e-16 ***
## `U_ORIENTC - Courtyard`          -1.203e+04  3.404e+03  -3.535 0.000409 ***
## `U_ORIENTE - End`                -3.190e+04  1.131e+04  -2.821 0.004797 ** 
## `U_ORIENTF - Front/Street`        3.466e+03  1.703e+03   2.035 0.041857 *  
## `U_ORIENTM - Middle`              1.009e+04  3.093e+03   3.263 0.001102 ** 
## `U_ORIENTT - Through`            -5.735e+03  1.725e+03  -3.325 0.000885 ***
## U_BDRMS                           1.590e+04  9.447e+02  16.836  < 2e-16 ***
## U_FULL_BTH                        1.378e+04  3.648e+03   3.778 0.000158 ***
## U_HALF_BTH                        2.857e+03  3.862e+03   0.740 0.459415    
## `U_BTH_STYLEM - Modern`          -6.018e+04  4.606e+03 -13.068  < 2e-16 ***
## `U_BTH_STYLEN - No Remodeling`   -7.193e+04  7.009e+03 -10.263  < 2e-16 ***
## `U_BTH_STYLES - Semi-Modern`     -6.394e+04  5.052e+03 -12.656  < 2e-16 ***
## `U_BTH_STYLE2M - Modern`         -5.547e+04  6.879e+03  -8.063 7.60e-16 ***
## `U_BTH_STYLE2N - No Remodeling`  -2.452e+04  1.147e+04  -2.137 0.032618 *  
## U_BTH_STYLE2No                   -1.403e+05  7.694e+03 -18.238  < 2e-16 ***
## `U_BTH_STYLE2S - Semi-Modern`    -9.532e+04  7.108e+03 -13.410  < 2e-16 ***
## `U_BTH_STYLE3M - Modern`          1.433e+04  2.209e+04   0.649 0.516554    
## `U_BTH_STYLE3N - No Remodeling`   3.426e+04  3.291e+04   1.041 0.297865    
## U_BTH_STYLE3No                   -1.071e+04  2.223e+04  -0.482 0.629993    
## `U_BTH_STYLE3S - Semi-Modern`    -7.687e+03  2.243e+04  -0.343 0.731823    
## `U_KITCH_TYPEF - Full Eat In`    -9.056e+04  1.169e+05  -0.775 0.438574    
## `U_KITCH_TYPEN - None`           -1.199e+05  1.176e+05  -1.019 0.308001    
## `U_KITCH_TYPEO - One Person`     -1.014e+05  1.169e+05  -0.867 0.385824    
## `U_KITCH_TYPEP - Pullman`        -1.705e+05  1.169e+05  -1.458 0.144735    
## `U_KITCH_STYLEM - Modern`        -4.656e+04  3.921e+03 -11.874  < 2e-16 ***
## `U_KITCH_STYLEN - No Remodeling` -7.556e+04  6.622e+03 -11.410  < 2e-16 ***
## `U_KITCH_STYLES - Semi-Modern`   -7.746e+04  4.419e+03 -17.531  < 2e-16 ***
## `U_HEAT_TYPF - Forced Hot Air`    1.491e+03  2.257e+03   0.661 0.508833    
## `U_HEAT_TYPN - None`             -2.320e+05  8.395e+04  -2.763 0.005730 ** 
## `U_HEAT_TYPO - Other`             2.316e+04  3.384e+04   0.684 0.493853    
## `U_HEAT_TYPP - Heat Pump`         3.164e+04  3.091e+03  10.235  < 2e-16 ***
## `U_HEAT_TYPS - Space Heat`        5.957e+03  1.323e+04   0.450 0.652598    
## `U_HEAT_TYPW - Ht Water/Steam`   -2.768e+03  2.070e+03  -1.337 0.181182    
## `U_ACD - Ductless AC`             2.871e+04  8.393e+03   3.421 0.000624 ***
## `U_ACN - None`                   -1.856e+04  1.544e+03 -12.024  < 2e-16 ***
## U_FPLACE                          4.273e+04  1.167e+03  36.624  < 2e-16 ***
## `U_INT_FING - Good`              -7.204e+04  4.798e+04  -1.501 0.133261    
## `U_INT_FINN - Normal`            -6.168e+04  4.207e+03 -14.663  < 2e-16 ***
## `U_INT_FINS - Substandard`       -9.660e+04  1.747e+04  -5.529 3.23e-08 ***
## `U_INT_CNDE - Excellent`          9.353e+04  2.701e+03  34.631  < 2e-16 ***
## `U_INT_CNDF - Fair`              -1.069e+04  8.703e+03  -1.228 0.219351    
## `U_INT_CNDG - Good`               2.808e+04  1.526e+03  18.406  < 2e-16 ***
## `U_INT_CNDP - Poor`              -8.698e+04  2.526e+04  -3.444 0.000573 ***
## `U_VIEWE - Excellent`             1.305e+05  3.771e+03  34.606  < 2e-16 ***
## `U_VIEWF - Fair`                 -2.880e+04  2.560e+03 -11.250  < 2e-16 ***
## `U_VIEWG - Good`                  3.563e+04  1.700e+03  20.962  < 2e-16 ***
## `U_VIEWP - Poor`                 -1.008e+05  9.216e+03 -10.942  < 2e-16 ***
## `U_VIEWS - Special`               4.306e+05  5.856e+04   7.353 1.97e-13 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 116800 on 53265 degrees of freedom
## Multiple R-squared:  0.7862, Adjusted R-squared:  0.7858 
## F-statistic:  2304 on 85 and 53265 DF,  p-value: < 2.2e-16
# Predict on the (outlier-containing) test set with the model trained
# without outliers
predict_mlr2 <- predict(mlr2, test1)
# Performance metrics for the prediction on the testing set:
# R2 (R-squared): share of the variance in the response explained by the model
# RMSE (root mean squared error): standard deviation of the residuals
# MAE (mean absolute error): average absolute difference between actual and
# predicted values, i.e. the average size of the residuals
data.frame(R2_2_1 = R2(predict_mlr2, test1$AV_BLDG),
           RMSE_2_1 = RMSE(predict_mlr2, test1$AV_BLDG),
           MAE_2_1 = MAE(predict_mlr2, test1$AV_BLDG))
##      R2_2_1 RMSE_2_1  MAE_2_1
## 1 0.6933966 523067.3 178474.2
# Observation index for plotting in test-set order
x <- seq_along(test1$AV_BLDG)

plot(x, test1$AV_BLDG, col = "red", ylab = "Value", type = "p",
     main = "Boston Condo Property Assessment - Actual Value")

plot(x, predict_mlr2, col = "blue", ylab = "Value", type = "p",
     main = "Boston Condo Property Assessment - Predicted Value")

plot(x, test1$AV_BLDG, col = "red", ylab = "Value", type = "p",
     main = "Boston Condo Property Assessment")
par(new = TRUE)
plot(x, predict_mlr2, col = "blue", ylab = "", type = "p")
legend("topright", legend = c("Actual Value", "Predicted Value"),
       fill = c("red", "blue"), col = 2:3, adj = c(0, 0.6))
grid()

# Predicted vs actual; points near the 45-degree line indicate good predictions
plot(predict_mlr2, test1$AV_BLDG, xlab = "predicted value", ylab = "actual value")
abline(a = 0, b = 1)

# Random forest with 5 trees on the outlier-free training set
# (10-fold CV, grid search over mtry)
startime <- Sys.time()
control2_2 <- trainControl(method = "cv", number = 10, search = "grid")
rf2_1 <- train(AV_BLDG ~ ., data = train2, method = "rf", ntree = 5,
               trControl = control2_2)
endtime <- Sys.time()
endtime - startime
## Time difference of 18.88791 mins
rf2_1
## Random Forest 
## 
## 53351 samples
##    24 predictor
## 
## No pre-processing
## Resampling: Cross-Validated (10 fold) 
## Summary of sample sizes: 48017, 48014, 48016, 48017, 48015, 48016, ... 
## Resampling results across tuning parameters:
## 
##   mtry  RMSE       Rsquared   MAE      
##    2    170813.96  0.6033097  127967.04
##   43     87504.65  0.8800031   50908.86
##   85     87696.11  0.8792837   50002.42
## 
## RMSE was used to select the optimal model using the smallest value.
## The final value used for the model was mtry = 43.
plot(rf2_1)

# Predict on test set
predict_rf2_1 <- predict(rf2_1, test1)
# Show performance metrics on test set
data.frame(R2_2_2 = R2(predict_rf2_1, test1$AV_BLDG),
           RMSE_2_2 = RMSE(predict_rf2_1, test1$AV_BLDG),
           MAE_2_2 = MAE(predict_rf2_1, test1$AV_BLDG))
##      R2_2_2 RMSE_2_2  MAE_2_2
## 1 0.4714317 601895.9 165442.7
# Visualize the results of actual value and predicted value
x <- seq_along(test1$AV_BLDG)

plot(x, test1$AV_BLDG, col = "red", ylab = "Value", type = "p",
     main = "Boston Condo Property Assessment - Actual Value")

plot(x, predict_rf2_1, col = "blue", ylab = "Value", type = "p",
     main = "Boston Condo Property Assessment - Predicted Value")

plot(x, test1$AV_BLDG, col = "red", ylab = "Value", type = "p",
     main = "Boston Condo Property Assessment")
par(new = TRUE)
plot(x, predict_rf2_1, col = "blue", ylab = "", type = "p")
legend("topright", legend = c("Actual Value", "Predicted Value"),
       fill = c("red", "blue"), col = 2:3, adj = c(0, 0.6))
grid()

# Visualize how close predicted value and actual value are
plot(predict_rf2_1, test1$AV_BLDG, xlab = "predicted value", ylab = "actual value")
abline(a = 0, b = 1)

# Random forest with 10 trees on the same training split, reusing the
# 10-fold CV grid-search control object (control2_2) defined above.
startime <- Sys.time()
rf2_2 <- train(AV_BLDG~., data = train2, method ="rf", ntree=10, trControl=control2_2)
endtime <- Sys.time()
# Elapsed training time (roughly double the 5-tree run, as expected)
endtime - startime
## Time difference of 37.5271 mins
# Print the cross-validation results across the mtry grid
rf2_2
## Random Forest 
## 
## 53351 samples
##    24 predictor
## 
## No pre-processing
## Resampling: Cross-Validated (10 fold) 
## Summary of sample sizes: 48016, 48016, 48016, 48017, 48015, 48016, ... 
## Resampling results across tuning parameters:
## 
##   mtry  RMSE       Rsquared   MAE      
##    2    164450.73  0.6603781  123552.10
##   43     82185.58  0.8947526   47315.97
##   85     83148.29  0.8917390   46983.77
## 
## RMSE was used to select the optimal model using the smallest value.
## The final value used for the model was mtry = 43.
# Plot cross-validated RMSE against the mtry tuning values
plot(rf2_2)

predict_rf2_2 <- predict(rf2_2, test1)
data.frame( R2_2_3 = R2(predict_rf2_2, test1$AV_BLDG),
            RMSE_2_3 = RMSE(predict_rf2_2, test1$AV_BLDG),
            MAE_2_3 = MAE(predict_rf2_2, test1$AV_BLDG))
##      R2_2_3 RMSE_2_3  MAE_2_3
## 1 0.4378564 613487.3 165277.6
#visualize the results of actual value and predicted value
x= 1:length(test1$AV_BLDG)

plot(x, test1$AV_BLDG, col="red", ylab = "Value", type="p",main="Boston Condo Property Assessment - Actual Value")

plot(x, predict_rf2_2, col='blue', ylab = "Value", type="p", main="Boston Condo Property Assessment - Predicted Value")

plot(x, test1$AV_BLDG, col="red", ylab = "Value", type="p",main="Boston Condo Property Assessment")
par(new=T)
plot(x, predict_rf2_2, col='blue', ylab = "", type="p")
legend("topright", legend=c("Actual Value", "Predicted Value"), fill = c("red","blue"), col=2:3, adj=c(0,0.6))
grid()

#Visualize how close predicted value and actual value are
plot(predict_rf2_2, test1$AV_BLDG, xlab = "preidtced value", ylab = "actual value")
abline(a=0,b=1)

# Prepare data for k-nearest-neighbor regression: kNN needs numeric inputs,
# so one-hot encode all categorical attributes with caret::dummyVars.
x_train2 <- train2
x_test2 <- test1
# One-hot encoding (separate encoders; levels present can differ by split)
dmy1 <- dummyVars(" ~ .", data = x_train2)
x_train2 <- data.frame(predict(dmy1, newdata = x_train2))
dmy2 <- dummyVars(" ~ .", data = x_test2)
x_test2 <- data.frame(predict(dmy2, newdata = x_test2))
# The "N - None" heat-type level is absent from the test split, so its dummy
# column is missing after encoding; add it as all-zero and move it to match
# the training set's column order.
x_test2$U_HEAT_TYPN...None <- 0
# Bug fix: the original piped x_test1 (an object from the other dataset's
# section) into this assignment, clobbering x_test2 with the wrong data.
x_test2 <- x_test2 %>% relocate(U_HEAT_TYPN...None, .after = U_HEAT_TYPF...Forced.Hot.Air)
# k-nearest-neighbor regression, tuning k over {1, 2} with 10-fold CV
startime <- Sys.time()
# Bug fix: the original call trained on x_train1 with control2_1 (objects
# from the other dataset's section) even though x_train2 was prepared
# immediately above; use this section's x_train2 / control2_2 instead.
knn2 <- train(AV_BLDG~., data = x_train2, method ="knn", trControl=control2_2, tuneGrid = expand.grid(k=1:2))
endtime <- Sys.time()
endtime - startime
## Time difference of 41.88771 mins
knn2
# NOTE(review): the recorded output below (58678 samples) appears to come
# from the pre-fix run on x_train1 (train2 has 53351 rows); re-run this
# chunk to refresh the output for x_train2.
## k-Nearest Neighbors 
## 
## 58678 samples
##    99 predictor
## 
## No pre-processing
## Resampling: Cross-Validated (10 fold) 
## Summary of sample sizes: 52810, 52810, 52810, 52810, 52811, 52810, ... 
## Resampling results across tuning parameters:
## 
##   k  RMSE      Rsquared   MAE     
##   1  522472.0  0.5915549  184392.6
##   2  467377.5  0.6420443  185586.6
## 
## RMSE was used to select the optimal model using the smallest value.
## The final value used for the model was k = 2.
# Predict on the one-hot encoded test set and report performance metrics
predict_knn2 <- predict(knn2, x_test2)
data.frame( R2_2_4 = R2(predict_knn2, x_test2$AV_BLDG),
            RMSE_2_4 = RMSE(predict_knn2, x_test2$AV_BLDG),
            MAE_2_4 = MAE(predict_knn2, x_test2$AV_BLDG))
##      R2_2_4 RMSE_2_4  MAE_2_4
## 1 0.6155162   478633 184474.3
# Visualize the actual and predicted values against observation index
x <- seq_along(x_test2$AV_BLDG)

plot(x, x_test2$AV_BLDG, col="red", ylab = "Value", type="p",
     main="Boston Condo Property Assessment - Actual Value")

plot(x, predict_knn2, col="blue", ylab = "Value", type="p",
     main="Boston Condo Property Assessment - Predicted Value")

# Overlay in one coordinate system: points() keeps the first plot's axes,
# fixing the par(new=T) + plot() bug where the second series was drawn on
# its own (different) y-scale.
plot(x, x_test2$AV_BLDG, col="red", ylab = "Value", type="p",
     main="Boston Condo Property Assessment",
     ylim = range(c(x_test2$AV_BLDG, predict_knn2)))
points(x, predict_knn2, col="blue")
legend("topright", legend=c("Actual Value", "Predicted Value"), fill = c("red","blue"), adj=c(0,0.6))
grid()

# Predicted vs actual scatter; the 45-degree line marks perfect predictions
plot(predict_knn2, x_test2$AV_BLDG, xlab = "predicted value", ylab = "actual value")
abline(a=0,b=1)